In [3]:
import pandas as pd

In [7]:
# DataFrame 읽기

df = pd.read_csv('example_df.csv')
df.tail()


Out[7]:
loan_status loan_amnt term int_rate grade annual_inc
1995 0 8000 36 11.58 2 30992.0
1996 1 6700 36 14.11 4 42050.0
1997 1 3000 36 9.63 1 30500.0
1998 1 6000 36 9.63 1 62000.0
1999 1 2500 36 11.58 2 95800.0

In [13]:
# Column 이름 지정
pd.read_csv('example_df.csv',
            names=['완납여부', '대출액', '상환기간', '연이율', '신용등급', '연수입']).tail()


Out[13]:
완납여부 대출액 상환기간 연이율 신용등급 연수입
1996 0 8000 36 11.58 2 30992
1997 1 6700 36 14.11 4 42050
1998 1 3000 36 9.63 1 30500
1999 1 6000 36 9.63 1 62000
2000 1 2500 36 11.58 2 95800

In [15]:
# Column을 Index
pd.read_csv('example_df.csv',
            index_col=['loan_status', 'term']).tail()


Out[15]:
loan_amnt int_rate grade annual_inc
loan_status term
0 36 8000 11.58 2 30992.0
1 36 6700 14.11 4 42050.0
36 3000 9.63 1 30500.0
36 6000 9.63 1 62000.0
36 2500 11.58 2 95800.0

In [19]:
# 구분자 지정
pd.read_csv('example_df.csv',
            sep='\t').tail()


Out[19]:
loan_status,loan_amnt,term,int_rate,grade,annual_inc
1995 0,8000,36,11.58,2,30992
1996 1,6700,36,14.11,4,42050
1997 1,3000,36,9.63,1,30500
1998 1,6000,36,9.63,1,62000
1999 1,2500,36,11.58,2,95800
  • \d - 숫자
  • \s - 공백(whitespace)
  • \w - 문자+숫자(alphanumeric)
  • \t - tab
  • \n - 개행문자

In [22]:
# 특정값 NA로 취급

na_val = {'term' : [36]}
pd.read_csv('example_df.csv', na_values=na_val).head()


Out[22]:
loan_status loan_amnt term int_rate grade annual_inc
0 1 7500 NaN 13.75 5 22000.0
1 1 3500 NaN 10.28 3 20000.0
2 1 5750 NaN 7.43 1 125000.0
3 1 5000 NaN 7.43 1 40000.0
4 1 1200 NaN 11.54 3 20000.0

In [23]:
# 행 생략
df.head()


Out[23]:
loan_status loan_amnt term int_rate grade annual_inc
0 1 7500 36 13.75 5 22000.0
1 1 3500 36 10.28 3 20000.0
2 1 5750 36 7.43 1 125000.0
3 1 5000 36 7.43 1 40000.0
4 1 1200 36 11.54 3 20000.0

In [24]:
pd.read_csv('example_df.csv', skiprows=[1, 2]).head()


Out[24]:
loan_status loan_amnt term int_rate grade annual_inc
0 1 5750 36 7.43 1 125000.0
1 1 5000 36 7.43 1 40000.0
2 1 1200 36 11.54 3 20000.0
3 1 12250 36 10.59 3 60000.0
4 1 2700 36 15.96 6 52200.0

In [31]:
# 일부 행만 읽기
df_output = pd.read_csv('example_df.csv', nrows=3)
df_output


Out[31]:
loan_status loan_amnt term int_rate grade annual_inc
0 1 7500 36 13.75 5 22000
1 1 3500 36 10.28 3 20000
2 1 5750 36 7.43 1 125000

In [33]:
# file output
df_output.to_csv('df_output.csv', index=False, header=False)

In [35]:
# 인터넷 csv파일 읽기
ip_data = pd.read_csv('https://r-forge.r-project.org/scm/viewvc.php/*checkout*/pkg/fBasics/data/IP.dat.csv?revision=1&root=rmetrics&pathrev=1', sep=';')
ip_data


Out[35]:
%d-%b-%Y IP
0 28-Jan-1919 7.628
1 28-Feb-1919 7.291
2 28-Mar-1919 7.080
3 28-Apr-1919 7.206
4 28-May-1919 7.249
5 28-Jun-1919 7.712
6 28-Jul-1919 8.176
7 28-Aug-1919 8.302
8 28-Sep-1919 8.134
9 28-Oct-1919 8.049
10 28-Nov-1919 7.923
11 28-Dec-1919 8.049
12 28-Jan-1920 8.808
13 28-Feb-1920 8.808
14 28-Mar-1920 8.639
15 28-Apr-1920 8.176
16 28-May-1920 8.386
17 28-Jun-1920 8.471
18 28-Jul-1920 8.260
19 28-Aug-1920 8.302
20 28-Sep-1920 8.007
21 28-Oct-1920 7.670
22 28-Nov-1920 7.038
23 28-Dec-1920 6.616
24 28-Jan-1921 6.237
25 28-Feb-1921 6.111
26 28-Mar-1921 5.942
27 28-Apr-1921 5.942
28 28-May-1921 6.111
29 28-Jun-1921 6.069
... ... ...
965 28-Jun-1999 138.787
966 28-Jul-1999 139.555
967 28-Aug-1999 140.249
968 28-Sep-1999 140.277
969 28-Oct-1999 141.335
970 28-Nov-1999 141.895
971 28-Dec-1999 142.856
972 28-Jan-2000 143.173
973 28-Feb-2000 144.043
974 28-Mar-2000 144.882
975 28-Apr-2000 145.636
976 28-May-2000 146.617
977 28-Jun-2000 147.188
978 28-Jul-2000 146.532
979 28-Aug-2000 146.700
980 28-Sep-2000 146.826
981 28-Oct-2000 146.266
982 28-Nov-2000 145.789
983 28-Dec-2000 145.135
984 28-Jan-2001 143.934
985 28-Feb-2001 143.509
986 28-Mar-2001 142.928
987 28-Apr-2001 142.007
988 28-May-2001 141.595
989 28-Jun-2001 140.326
990 28-Jul-2001 140.402
991 28-Aug-2001 139.954
992 28-Sep-2001 138.821
993 28-Oct-2001 137.551
994 28-Nov-2001 137.139

995 rows × 2 columns

인터넷 DB 읽기

  • Yahoo! Finance
  • Google Finance
  • St.Louis FED (FRED)
  • Kenneth French's data library
  • World Bank
  • Google Analytics
  • pip install pandas_datareader

In [37]:
import pandas_datareader.data as web
import datetime

start = datetime.datetime(2016, 1, 1)
end = datetime.datetime(2016, 12, 31)
print(start, end)


2016-01-01 00:00:00 2016-12-31 00:00:00

In [38]:
df = web.DataReader('005930.KS', 'yahoo', start, end)
df.tail()


Out[38]:
Open High Low Close Volume Adj Close
Date
2016-12-26 1780000.0 1800000.0 1778000.0 1798000.0 96400 1770515.29
2016-12-27 1799000.0 1810000.0 1793000.0 1799000.0 93000 1771500.01
2016-12-28 1792000.0 1799000.0 1780000.0 1788000.0 133200 1788000.00
2016-12-29 1771000.0 1802000.0 1770000.0 1802000.0 150300 1802000.00
2016-12-30 1802000.0 1802000.0 1802000.0 1802000.0 0 1802000.00

In [39]:
df = web.DataReader('KRX:005930', 'google', start, end)
df.tail()


Out[39]:
Open High Low Close Volume
Date
2016-12-23 1801000.0 1804000.0 1780000.0 1782000.0 162173
2016-12-26 1780000.0 1800000.0 1778000.0 1798000.0 96051
2016-12-27 1799000.0 1810000.0 1793000.0 1799000.0 91981
2016-12-28 1792000.0 1799000.0 1780000.0 1788000.0 132355
2016-12-29 1771000.0 1802000.0 1770000.0 1802000.0 139768

In [40]:
df = web.DataReader('GDP', 'fred', start, end)
df


Out[40]:
GDP
DATE
2016-01-01 18281.6
2016-04-01 18450.1
2016-07-01 18675.3
2016-10-01 18869.4